require(Hmisc)
knitrSet(lang='markdown')

Descriptive Statistics

require(rms)
getHdata(FEV)
contents(FEV)

Data frame:FEV  654 observations and 6 variables    Maximum # NAs:0

        Units Levels Storage
id                   integer
age     years        integer
fev    liters         double
height inches         double
sex                2 integer
smoke              2 integer

+--------+---------------------------------+
|Variable|Levels                           |
+--------+---------------------------------+
|  sex   |female,male                      |
+--------+---------------------------------+
|  smoke |non-current smoker,current smoker|
+--------+---------------------------------+
describe(FEV)
FEV 

 6  Variables      654  Observations
---------------------------------------------------------------------------
id 
      n missing  unique    Info    Mean     .05     .10     .25     .50 
    654       0     654       1   37170    3142    6162   15811   36071 
    .75     .90     .95 
  53638   73342   77706 

lowest :   201   202   301   341   351
highest: 83841 83901 83951 83952 90001 
---------------------------------------------------------------------------
age [years] 
      n missing  unique    Info    Mean     .05     .10     .25     .50 
    654       0      17   0.988   9.931       5       6       8      10 
    .75     .90     .95 
     12      14      15 

lowest :  3  4  5  6  7, highest: 15 16 17 18 19 
                                                                      
Value          3     4     5     6     7     8     9    10    11    12
Frequency      2     9    28    37    54    85    94    81    90    57
Proportion 0.003 0.014 0.043 0.057 0.083 0.130 0.144 0.124 0.138 0.087
                                                    
Value         13    14    15    16    17    18    19
Frequency     43    25    19    13     8     6     3
Proportion 0.066 0.038 0.029 0.020 0.012 0.009 0.005
---------------------------------------------------------------------------
fev [liters] 
      n missing  unique    Info    Mean     .05     .10     .25     .50 
    654       0     575       1   2.637   1.445   1.612   1.981   2.547 
    .75     .90     .95 
  3.119   3.813   4.289 

lowest : 0.791 0.796 0.839 1.004 1.072
highest: 5.102 5.224 5.633 5.638 5.793 
---------------------------------------------------------------------------
height [inches] 
      n missing  unique    Info    Mean     .05     .10     .25     .50 
    654       0      56   0.999   61.14    51.0    53.0    57.0    61.5 
    .75     .90     .95 
   65.5    68.5    70.0 

lowest : 46.0 46.5 47.0 48.0 49.0, highest: 72.0 72.5 73.0 73.5 74.0 
---------------------------------------------------------------------------
sex 
      n missing  unique 
    654       0       2 

female (318, 0.486), male (336, 0.514)
---------------------------------------------------------------------------
smoke 
      n missing  unique 
    654       0       2 

non-current smoker (589, 0.901), current smoker (65, 0.099)
---------------------------------------------------------------------------
ggplot(FEV, aes(x=age, y=fev, color=height)) + geom_point() + facet_grid(smoke ~ sex)
dd <- datadist(FEV); options(datadist='dd')
dd
                        id age      fev height    sex              smoke
Low:effect      15811.0000   8 1.981000   57.0   <NA>               <NA>
Adjust to       36071.0000  10 2.547500   61.5   male non-current smoker
High:effect     53638.5000  12 3.118500   65.5   <NA>               <NA>
Low:prediction    600.2355   4 1.252128   49.0 female non-current smoker
High:prediction 81741.1529  17 4.789642   72.0   male     current smoker
Low               201.0000   3 0.791000   46.0 female non-current smoker
High            90001.0000  19 5.793000   74.0   male     current smoker

Values:

sex : female male 
smoke : non-current smoker current smoker 

Regression Models of Increasing Complexity

FEV is log-transformed to satisfy normality and equal variance assumptions.

Single Predictor, Linear Effect

f <- ols(log(fev) ~ age, data=FEV)
f

Linear Regression Model

ols(formula = log(fev) ~ age, data = FEV)
                Model Likelihood     Discrimination    
                   Ratio Test           Indexes        
Obs      654    LR chi2    592.40    R2       0.596    
sigma 0.2120    d.f.            1    R2 adj   0.595    
d.f.     652    Pr(> chi2) 0.0000    g        0.287    

Residuals

     Min       1Q   Median       3Q      Max 
-0.72047 -0.13752  0.00302  0.14681  0.60267 

          Coef   S.E.   t     Pr(>|t|)
Intercept 0.0506 0.0291  1.74 0.0826  
age       0.0871 0.0028 31.00 <0.0001 
r <- as.numeric(resid(f))
with(FEV, plot(age, r))
qqnorm(r)
qqline(r)

# Show algebraic form of fitted model
Function(f)
function (age = 10) 
{
    0.050596002 + 0.087083296 * age
}
<environment: 0x88b0db8>
g <- Function(f)
exp(g()); exp(g(age=10))   # Evaluate the fitted model, antilog to get original scale
[1] 2.512879
[1] 2.512879
ggplot(Predict(f)) + geom_point(aes(x=age, y=log(fev), color=sex), FEV)

Restricted Cubic Spline With 3 Default Knots

The knots are at these quantiles of age: 0.1, 0.5, 0.9.

f <- ols(log(fev) ~ rcs(age, 3), data=FEV, x=TRUE)
f

Linear Regression Model

ols(formula = log(fev) ~ rcs(age, 3), data = FEV, x = TRUE)
                Model Likelihood     Discrimination    
                   Ratio Test           Indexes        
Obs      654    LR chi2    668.33    R2       0.640    
sigma 0.2002    d.f.            2    R2 adj   0.639    
d.f.     651    Pr(> chi2) 0.0000    g        0.297    

Residuals

      Min        1Q    Median        3Q       Max 
-0.608234 -0.134592  0.006764  0.139165  0.538798 

          Coef    S.E.   t     Pr(>|t|)
Intercept -0.3377 0.0513 -6.58 <0.0001 
age        0.1386 0.0063 21.88 <0.0001 
age'      -0.0627 0.0070 -8.95 <0.0001 
g <- Function(f)
g(10:20)
 [1] 0.9852999 1.0660775 1.1292243 1.1806172 1.2261332 1.2706697 1.3152062
 [8] 1.3597427 1.4042792 1.4488157 1.4933522
g()
[1] 0.9852999
g
function (age = 10) 
{
    -0.33768719 + 0.13856744 * age - 0.00097948891 * pmax(age - 
        6, 0)^3 + 0.0019589778 * pmax(age - 10, 0)^3 - 0.00097948891 * 
        pmax(age - 14, 0)^3
}
<environment: 0x9aaf7c0>
ggplot(Predict(f))
anova(f)
                Analysis of Variance          Response: log(fev) 

 Factor     d.f. Partial SS MS          F      P     
 age          2  46.42323   23.21161744 578.90 <.0001
  Nonlinear   1   3.21318    3.21318002  80.14 <.0001
 REGRESSION   2  46.42323   23.21161744 578.90 <.0001
 ERROR      651  26.10268    0.04009628              

RCS With 5 Default Knots

f <- ols(log(fev) ~ rcs(age, 5), data=FEV)
f

Linear Regression Model

ols(formula = log(fev) ~ rcs(age, 5), data = FEV)
                Model Likelihood     Discrimination    
                   Ratio Test           Indexes        
Obs      654    LR chi2    678.99    R2       0.646    
sigma 0.1989    d.f.            4    R2 adj   0.644    
d.f.     649    Pr(> chi2) 0.0000    g        0.303    

Residuals

      Min        1Q    Median        3Q       Max 
-0.621716 -0.135183  0.008372  0.144535  0.543184 

          Coef    S.E.   t     Pr(>|t|)
Intercept -0.1758 0.0985 -1.78 0.0748  
age        0.1126 0.0159  7.10 <0.0001 
age'       0.0383 0.0817  0.47 0.6391  
age''     -0.2167 0.4077 -0.53 0.5951  
age'''    -0.1496 0.9059 -0.17 0.8689  
Function(f)
function (age = 10) 
{
    -0.17580202 + 0.11261215 * age + 0.0003834753 * pmax(age - 
        5, 0)^3 - 0.0021674701 * pmax(age - 8, 0)^3 - 0.0014960677 * 
        pmax(age - 10, 0)^3 + 0.0047044691 * pmax(age - 11, 0)^3 - 
        0.0014244066 * pmax(age - 15, 0)^3
}
<environment: 0x86abf50>
anova(f)
                Analysis of Variance          Response: log(fev) 

 Factor     d.f. Partial SS MS          F      P     
 age          4  46.845368  11.71134190 295.97 <.0001
  Nonlinear   3   3.635313   1.21177091  30.62 <.0001
 REGRESSION   4  46.845368  11.71134190 295.97 <.0001
 ERROR      649  25.680547   0.03956941              
ggplot(Predict(f))

RCS with 5 Knots, Additive (Non-Interacting) Sex Effect

f <- ols(log(fev) ~ rcs(age, 5) + sex, data=FEV)
Function(f)
function (age = 10, sex = "male") 
{
    -0.23344699 + 0.11429274 * age + 0.00019963206 * pmax(age - 
        5, 0)^3 - 0.0010413455 * pmax(age - 8, 0)^3 - 0.0043083413 * 
        pmax(age - 10, 0)^3 + 0.0067087011 * pmax(age - 11, 0)^3 - 
        0.0015586464 * pmax(age - 15, 0)^3 + 0.099501651 * (sex == 
        "male")
}
<environment: 0xc00d218>
anova(f)
                Analysis of Variance          Response: log(fev) 

 Factor     d.f. Partial SS MS         F      P     
 age          4  46.370003  11.5925008 312.11 <.0001
  Nonlinear   3   3.675485   1.2251616  32.99 <.0001
 sex          1   1.612016   1.6120158  43.40 <.0001
 REGRESSION   5  48.457383   9.6914767 260.92 <.0001
 ERROR      648  24.068532   0.0371428              
ggplot(Predict(f, age, sex)) + geom_point(aes(x=age, y=log(fev), color=sex), FEV)

RCS in Age Interacting With Sex

The following model allows for different shapes of effects for males and females.

f <- ols(log(fev) ~ rcs(age, 5) * sex, data=FEV)
r <- as.numeric(resid(f))
plot(fitted(f), r)
qqnorm(r); qqline(r)
Function(f)
function (age = 10, sex = "male") 
{
    -0.33458936 + 0.13366998 * age - 2.9172721e-05 * pmax(age - 
        5, 0)^3 - 0.0039056026 * pmax(age - 8, 0)^3 + 0.0078871147 * 
        pmax(age - 10, 0)^3 - 0.002951157 * pmax(age - 11, 0)^3 - 
        0.0010011823 * pmax(age - 15, 0)^3 + 0.34502774 * (sex == 
        "male") + (sex == "male") * (-0.046273816 * age + 0.00093632418 * 
        pmax(age - 5, 0)^3 + 0.0034513932 * pmax(age - 8, 0)^3 - 
        0.020568634 * pmax(age - 10, 0)^3 + 0.017330044 * pmax(age - 
        11, 0)^3 - 0.0011491273 * pmax(age - 15, 0)^3)
}
<environment: 0x73905b8>
anova(f)
                Analysis of Variance          Response: log(fev) 

 Factor                                   d.f. Partial SS MS        
 age  (Factor+Higher Order Factors)         8  48.3058462 6.03823078
  All Interactions                          4   1.9358432 0.48396079
  Nonlinear (Factor+Higher Order Factors)   6   4.7393834 0.78989723
 sex  (Factor+Higher Order Factors)         5   3.5478589 0.70957178
  All Interactions                          4   1.9358432 0.48396079
 age * sex  (Factor+Higher Order Factors)   4   1.9358432 0.48396079
  Nonlinear                                 3   0.8210779 0.27369262
  Nonlinear Interaction : f(A,B) vs. AB     3   0.8210779 0.27369262
 TOTAL NONLINEAR                            6   4.7393834 0.78989723
 TOTAL NONLINEAR + INTERACTION              7   5.6113278 0.80161826
 REGRESSION                                 9  50.3932265 5.59924739
 ERROR                                    644  22.1326884 0.03436753
 F      P     
 175.70 <.0001
  14.08 <.0001
  22.98 <.0001
  20.65 <.0001
  14.08 <.0001
  14.08 <.0001
   7.96 <.0001
   7.96 <.0001
  22.98 <.0001
  23.32 <.0001
 162.92 <.0001
              

ggplot(Predict(f, age, sex))

Plot predicted median FEV by anti-logging predicted values.

ggplot(Predict(f, age, sex, fun=exp), ylab='Estimated Median FEV')

RCS with Age * Sex and Additive Nonlinear Effect of Height

We show the joint age and height effects using a color image plot. A silly x-axis label is used to show LaTeX math mode.

f <- ols(log(fev) ~ rcs(age, 5) * sex + rcs(height, 5), data=FEV)
anova(f)
                Analysis of Variance          Response: log(fev) 

 Factor                                   d.f. Partial SS MS        
 age  (Factor+Higher Order Factors)         8   1.3289091 0.16611364
  All Interactions                          4   0.3636977 0.09092444
  Nonlinear (Factor+Higher Order Factors)   6   0.4358008 0.07263346
 sex  (Factor+Higher Order Factors)         5   0.5969813 0.11939627
  All Interactions                          4   0.3636977 0.09092444
 height                                     4   8.8829270 2.22073175
  Nonlinear                                 3   0.1721358 0.05737861
 age * sex  (Factor+Higher Order Factors)   4   0.3636977 0.09092444
  Nonlinear                                 3   0.3333276 0.11110920
  Nonlinear Interaction : f(A,B) vs. AB     3   0.3333276 0.11110920
 TOTAL NONLINEAR                            9   0.5804472 0.06449413
 TOTAL NONLINEAR + INTERACTION             10   0.5865061 0.05865061
 REGRESSION                                13  59.2761535 4.55970412
 ERROR                                    640  13.2497614 0.02070275
 F      P     
   8.02 <.0001
   4.39 0.0016
   3.51 0.0020
   5.77 <.0001
   4.39 0.0016
 107.27 <.0001
   2.77 0.0408
   4.39 0.0016
   5.37 0.0012
   5.37 0.0012
   3.12 0.0011
   2.83 0.0019
 220.25 <.0001
              
require(plotly)
ggplotly(ggplot(Predict(f, age, sex), xlab='$\\chi^{2}$',
                adj.subtitle=FALSE))
p <- Predict(f, age, height)
bplot(p)

p <- with(p, list(age=unique(age), height=unique(height),
                  Yhat=matrix(yhat, ncol=200)))
plot_ly(p, x=age, y=height, z=Yhat,
        type='heatmap')
plot_ly(p, x=age, y=height, z=Yhat, type='surface')

Model Ignoring Sex and Height But Including Smoking History

By plotting spike histograms showing the smoking-specific age distribution we see that there are almost no very young children who have smoked. This limits the power to detect an age by smoking interaction.

f <- ols(log(fev) ~ rcs(age, 5) + smoke, data=FEV)
ggplotly(ggplot(Predict(f, age, smoke), rdata=FEV))

Computing Environment

R version 3.3.1 (2016-06-21)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04 LTS

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] plotly_3.6.0    rms_4.5-1       SparseM_1.7     Hmisc_3.18-0   
[5] ggplot2_2.1.0   Formula_1.2-1   survival_2.39-5 lattice_0.20-33

R Core Team (2016). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria. .